In [1]:
import os
import pandas as pd
import json
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier

import json
import os
from collections import defaultdict

import numpy as np
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import seaborn as sns

from langdetect import detect
from langdetect import DetectorFactory
import pickle

import spacy
from tqdm import tqdm 

from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.metrics.classification module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
  warnings.warn(message, FutureWarning)
In [2]:
data = pd.read_csv('data.csv')
data.shape
Out[2]:
(68905, 7)
In [3]:
data.columns
Out[3]:
Index(['paper_id', 'title', 'authors', 'abstract', 'body_text',
       'body_text_new', 'abstract_new'],
      dtype='object')
In [4]:
data.head(5)
Out[4]:
paper_id title authors abstract body_text body_text_new abstract_new
0 4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea end-tidal carbon dioxide levels during resusci... Kentaro Tamura, Emma E Williams, Theodore Dass... Abstract\n\nAbnormal levels of end-tidal carbo... Introduction\n\nImprovements in neonatal inten... improvement, neonatal, intensive care, decreas... abstract, abnormal, level, end-tidal carbon di...
1 86d4262de73cf81b5ea6aafb91630853248bff5f urban planning of the endoplasmic reticulum (e... Emily M Lynes, Thomas Simmen Abstract\n\nThe endoplasmic reticulum (ER) is ... Introduction\n\nThe endoplasmic reticulum (ER)... endoplasmic reticulum, multi-functional, organ... abstract, endoplasmic reticulum, organelle, ce...
2 b2f67d533f2749807f2537f3775b39da3b186051 caring for persons in detention suffering with... Michael Liebrenz, Dinesh Bhugra, Anna Buadze, ... NaN \n\nThere is a disproportionate number of indi... individual, mental, somatic illnesses, person,... NaN
3 9ec0b1175992879d5b8d3351ef40a28bb48f18ce NaN M I Garvey, M Biggs, V Reddy-Kolanu, H Flavell... NaN Seasonal respiratory virus testing in manageme... seasonal respiratory virus testing, management... NaN
4 86a998617c077f4fe2ab26214995a3548fbc0fc5 middle east respiratory syndrome and severe ac... Rahul Vijay, Stanley Perlman Abstract\n\nThe recent emergence of the Middle... Introduction\n\nWhile most CoVs cause the comm... covs, cold, human, infection, covs, sars-cov, ... abstract, recent, emergence, middle east respi...
In [5]:
data.describe()
Out[5]:
paper_id title authors abstract body_text body_text_new abstract_new
count 68905 61546 62094 47731 68905 68875 47731
unique 68905 58002 58041 46271 67661 67608 46067
top 235297ea9bb5c18917d273148a60189f6249107e journal pre-proof Xxx Xxx Xxx • Volume, Xxx Number Abstract\n\nThe purpose of this form is to pro... \n\nThe nuclear receptor heterodimers of liver... nuclear receptor heterodimers, liver x recepto... abstract
freq 1 104 74 43 69 69 102
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68905 entries, 0 to 68904
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   paper_id       68905 non-null  object
 1   title          61546 non-null  object
 2   authors        62094 non-null  object
 3   abstract       47731 non-null  object
 4   body_text      68905 non-null  object
 5   body_text_new  68875 non-null  object
 6   abstract_new   47731 non-null  object
dtypes: object(7)
memory usage: 3.7+ MB

Feature Analysis/Engineering

In [7]:
#Body text data that might be useful
data['digit_count'] = data['body_text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data['body_word_count'] = data['body_text'].apply(lambda x: len(x.strip().split()))  # word count in body
data['body_unique_words']= data['body_text'].apply(lambda x:len(set(x.strip().split()))) 
In [8]:
data.drop_duplicates(['abstract', 'body_text'], inplace=True)
data['abstract'].describe(include='all')
data.dropna(inplace=True)
In [9]:
#Checking languages

DetectorFactory.seed = 0

languages = []

for i in range(0,len(data)):
    # split by space into list, take the first x intex, join with space
    text = data.iloc[i]['body_text'].split(" ")
    
    lang = "en"
    try:
        if len(text) > 50:
            lang = detect(" ".join(text[:50]))
        elif len(text) > 0:
            lang = detect(" ".join(text[:len(text)]))
    except Exception as e:
        all_words = set(text)
        try:
            lang = detect(" ".join(all_words))
        except Exception as e:
            
            try:
                # let's try to label it through the abstract then
                lang = detect(df.iloc[i]['abstract_summary'])
            except Exception as e:
                lang = "unknown"
                pass    
    languages.append(lang)
In [10]:
languages_dict = {}
for lang in set(languages):
    languages_dict[lang] = languages.count(lang)
    
data['language'] = languages
for key,value in languages_dict.items():
    print(str(key) +' : ' + str(value))
da : 1
id : 1
cy : 1
pt : 11
fr : 203
et : 1
es : 156
ca : 1
nl : 93
ru : 1
en : 43671
de : 239
it : 29
ko : 1
In [11]:
fig, ax = plt.subplots(figsize=(10,7))
fig.size = (40,20)
graph = plt.bar(range(len(languages_dict)), list(languages_dict.values()))
plt.xticks(range(len(languages_dict)), list(languages_dict.keys()))
#plt.pie(list(languages_dict.values()), labels=list(languages_dict.keys()))
ax.set_title("Languages dsitribution in Dataset")
ax.set_ylabel("Number of Articles")
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(graph)
plt.savefig("language_distribution.png")
plt.show()
In [12]:
#let's drop all non-english languages!
data = data[data['language'] == 'en']
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 43671 entries, 0 to 68902
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   paper_id           43671 non-null  object
 1   title              43671 non-null  object
 2   authors            43671 non-null  object
 3   abstract           43671 non-null  object
 4   body_text          43671 non-null  object
 5   body_text_new      43671 non-null  object
 6   abstract_new       43671 non-null  object
 7   digit_count        43671 non-null  int64 
 8   body_word_count    43671 non-null  int64 
 9   body_unique_words  43671 non-null  int64 
 10  language           43671 non-null  object
dtypes: int64(3), object(8)
memory usage: 4.0+ MB
In [13]:
sum(data['body_unique_words'])/len(data['body_unique_words'])
Out[13]:
1393.4724187676031
In [14]:
sum(data['body_word_count'])/len(data['body_word_count'])
Out[14]:
4592.339859403265
In [15]:
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.distplot(data['body_unique_words'],150)
max_x = 12000
ax.set_xlim(0,max_x)
ax.set_xticks(range(0,max_x,1500))
plt.title("Unique Word Count Distribution")
ax.set_ylabel("Frequency")
ax.set_xlabel('Number of Unique Words')
plt.savefig("unique_words.png")
In [16]:
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.distplot(data['body_word_count'],300)
max_x = 30000
ax.set_xlim(0,max_x)
ax.set_xticks(range(0,max_x,2500))
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
plt.title("Word Count Distribution")
plt.savefig("word_count.png")
Let's add tags to papers with specific risk factors
In [17]:
factors = {
    'alcohol':['alcohol','drinking','toxic','drink'],
    'heart':['heart disease','heart failure','blood pressure','hypertension'],
    'gender':['gender','sex','male','female'],
    'weight':['obese','overweight','weight'],
    'lung':['smoke','smoking','cigarettes','tabacco','lung','respitory','athsma'],
    'age':['age','elderly','senior','adult','dementia']
}
# risk['tags'] = 0
f = ['alcohol','heart','gender','weight','lung','age']

def tag(df,d):
    for factor,topics in d.items():
        for topic in topics:
            df[factor] = np.nan
            df[factor] = np.where(df['body_text_new'].str.contains(topic, case=False, na=False), 1, '')
# def tag(df,d):
#     for index, row in df.iterrows():
#         tags = []
#         for factor,words in factors.items():
#             for word in words:
#                 if word in row['body_text_new']:
#                     risk.loc[index,factor] = 1
#                     if factor not in tags:
#                         tags.append(factor)
#         risk.loc[index,'tags'] = [tags]
    return df

data = tag(data,factors)
In [18]:
data = data.fillna(0)
data.head(5)
Out[18]:
paper_id title authors abstract body_text body_text_new abstract_new digit_count body_word_count body_unique_words language alcohol heart gender weight lung age
0 4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea end-tidal carbon dioxide levels during resusci... Kentaro Tamura, Emma E Williams, Theodore Dass... Abstract\n\nAbnormal levels of end-tidal carbo... Introduction\n\nImprovements in neonatal inten... improvement, neonatal, intensive care, decreas... abstract, abnormal, level, end-tidal carbon di... 105 2615 846 en 1
1 86d4262de73cf81b5ea6aafb91630853248bff5f urban planning of the endoplasmic reticulum (e... Emily M Lynes, Thomas Simmen Abstract\n\nThe endoplasmic reticulum (ER) is ... Introduction\n\nThe endoplasmic reticulum (ER)... endoplasmic reticulum, multi-functional, organ... abstract, endoplasmic reticulum, organelle, ce... 24 8089 2295 en
4 86a998617c077f4fe2ab26214995a3548fbc0fc5 middle east respiratory syndrome and severe ac... Rahul Vijay, Stanley Perlman Abstract\n\nThe recent emergence of the Middle... Introduction\n\nWhile most CoVs cause the comm... covs, cold, human, infection, covs, sars-cov, ... abstract, recent, emergence, middle east respi... 21 2485 1053 en
5 948aaeb2e0be11ad90562bf10d462531a1f00eac integrated, multi-cohort analysis identifies c... Marta Andres-Terre, Helen M Mcguire, Yannick P... Abstract\n\nGraphical Abstract Highlights d MV... In Brief\n\nClinically relevant respiratory vi... brief, clinically, respiratory viral signature... abstract, graphical abstract, transcriptional,... 142 7115 1826 en 1
7 306ef95a3a91e13a93bcc37fb2c509b67c0b5640 a novel approach for a novel pathogen: using a... Chloe Bryson-Cahn, Jeffrey Duchin, Vanessa A M... Abstract\n\nThousands of people in the United ... Introduction\n\nThe 2019 novel coronavirus (SA... novel, coronavirus, sars-cov-2, outbreak, resp... abstract, people, testing, sars-cov-2, evaluat... 20 937 486 en 1
In [19]:
risk = pd.melt(data,
               value_vars = f,
               var_name = 'tag',
               value_name = 'count')
risk = risk[risk['count'] == '1']
risk.head()
Out[19]:
tag count
4 alcohol 1
34 alcohol 1
43 alcohol 1
80 alcohol 1
144 alcohol 1
In [20]:
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.countplot(data=risk,x='tag')
# ax.set_ylim(43670,43675)
plt.title("Risk Tags")
ax.set_ylabel("Frequency")
ax.set_xlabel('Tags')
plt.savefig("tags.png")
In [21]:
risk_words = ['risk','estimat','characteristic','factors','features','study','predict','clinic']

def risk_tag(df,l):
    for item in l:
        df['risk'] = np.where(df['title'].str.contains(item, case=False, na=False), 1, '')
    return df
In [22]:
data['risk'] = np.nan
data = risk_tag(data,risk_words)
data = data.fillna(0)

data.head(5)
Out[22]:
paper_id title authors abstract body_text body_text_new abstract_new digit_count body_word_count body_unique_words language alcohol heart gender weight lung age risk
0 4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea end-tidal carbon dioxide levels during resusci... Kentaro Tamura, Emma E Williams, Theodore Dass... Abstract\n\nAbnormal levels of end-tidal carbo... Introduction\n\nImprovements in neonatal inten... improvement, neonatal, intensive care, decreas... abstract, abnormal, level, end-tidal carbon di... 105 2615 846 en 1
1 86d4262de73cf81b5ea6aafb91630853248bff5f urban planning of the endoplasmic reticulum (e... Emily M Lynes, Thomas Simmen Abstract\n\nThe endoplasmic reticulum (ER) is ... Introduction\n\nThe endoplasmic reticulum (ER)... endoplasmic reticulum, multi-functional, organ... abstract, endoplasmic reticulum, organelle, ce... 24 8089 2295 en
4 86a998617c077f4fe2ab26214995a3548fbc0fc5 middle east respiratory syndrome and severe ac... Rahul Vijay, Stanley Perlman Abstract\n\nThe recent emergence of the Middle... Introduction\n\nWhile most CoVs cause the comm... covs, cold, human, infection, covs, sars-cov, ... abstract, recent, emergence, middle east respi... 21 2485 1053 en
5 948aaeb2e0be11ad90562bf10d462531a1f00eac integrated, multi-cohort analysis identifies c... Marta Andres-Terre, Helen M Mcguire, Yannick P... Abstract\n\nGraphical Abstract Highlights d MV... In Brief\n\nClinically relevant respiratory vi... brief, clinically, respiratory viral signature... abstract, graphical abstract, transcriptional,... 142 7115 1826 en 1
7 306ef95a3a91e13a93bcc37fb2c509b67c0b5640 a novel approach for a novel pathogen: using a... Chloe Bryson-Cahn, Jeffrey Duchin, Vanessa A M... Abstract\n\nThousands of people in the United ... Introduction\n\nThe 2019 novel coronavirus (SA... novel, coronavirus, sars-cov-2, outbreak, resp... abstract, people, testing, sars-cov-2, evaluat... 20 937 486 en 1
In [23]:
print('There are {} articles that are tagged as risk articles'.format(len(data[data['risk']=='1'])))
There are 1880 articles that are tagged as risk articles
In [24]:
# Thought this would be useful but it wasn't
quantile_95 = data['body_word_count'].quantile(0.95)
df_95 = data[data['body_word_count'] < quantile_95]
plt.figure(figsize=(12.8,6))
sns.distplot(df_95['body_word_count']).set_title('Text Body Word Count');
In [25]:
data.to_csv('features.csv',index=False)
In [26]:
data = pd.read_csv('features.csv')
data.head(5)
Out[26]:
paper_id title authors abstract body_text body_text_new abstract_new digit_count body_word_count body_unique_words language alcohol heart gender weight lung age risk
0 4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea end-tidal carbon dioxide levels during resusci... Kentaro Tamura, Emma E Williams, Theodore Dass... Abstract\n\nAbnormal levels of end-tidal carbo... Introduction\n\nImprovements in neonatal inten... improvement, neonatal, intensive care, decreas... abstract, abnormal, level, end-tidal carbon di... 105 2615 846 en NaN NaN NaN 1.0 NaN NaN NaN
1 86d4262de73cf81b5ea6aafb91630853248bff5f urban planning of the endoplasmic reticulum (e... Emily M Lynes, Thomas Simmen Abstract\n\nThe endoplasmic reticulum (ER) is ... Introduction\n\nThe endoplasmic reticulum (ER)... endoplasmic reticulum, multi-functional, organ... abstract, endoplasmic reticulum, organelle, ce... 24 8089 2295 en NaN NaN NaN NaN NaN NaN NaN
2 86a998617c077f4fe2ab26214995a3548fbc0fc5 middle east respiratory syndrome and severe ac... Rahul Vijay, Stanley Perlman Abstract\n\nThe recent emergence of the Middle... Introduction\n\nWhile most CoVs cause the comm... covs, cold, human, infection, covs, sars-cov, ... abstract, recent, emergence, middle east respi... 21 2485 1053 en NaN NaN NaN NaN NaN NaN NaN
3 948aaeb2e0be11ad90562bf10d462531a1f00eac integrated, multi-cohort analysis identifies c... Marta Andres-Terre, Helen M Mcguire, Yannick P... Abstract\n\nGraphical Abstract Highlights d MV... In Brief\n\nClinically relevant respiratory vi... brief, clinically, respiratory viral signature... abstract, graphical abstract, transcriptional,... 142 7115 1826 en NaN NaN 1.0 NaN NaN NaN NaN
4 306ef95a3a91e13a93bcc37fb2c509b67c0b5640 a novel approach for a novel pathogen: using a... Chloe Bryson-Cahn, Jeffrey Duchin, Vanessa A M... Abstract\n\nThousands of people in the United ... Introduction\n\nThe 2019 novel coronavirus (SA... novel, coronavirus, sars-cov-2, outbreak, resp... abstract, people, testing, sars-cov-2, evaluat... 20 937 486 en 1.0 NaN NaN NaN NaN NaN NaN

Labeling the Data

In [27]:
def vectorize(text, maxfeatures):
    vectorizer = TfidfVectorizer(max_df=0.1,
                                 min_df = 0.05,
                                 ngram_range = (1,2),
                                 max_features=maxfeatures)
    X = vectorizer.fit_transform(text)
    return X
In [28]:
text = data['body_text_new'].values
X = vectorize(text, 2 ** 10)
X.shape
Out[28]:
(43671, 1024)
In [29]:
#We obviously want to reduce our dimensionality
#We will do this using PCA to hopefully make kmeans clustering a little easier

pca = PCA(n_components=0.9)
X_reduced= pca.fit_transform(X.toarray())
X_reduced.shape
Out[29]:
(43671, 673)
In [7]:
#So before we run kmeans we need to find what k we should use
#We'll compute the distortion value (distance from center of cluster)

# run kmeans with many different k
# distortions = []
# K = range(4, 40)
# for k in K:
#     k_means = KMeans(n_clusters=k).fit(X_reduced)
#     distortions.append(k_means.inertia_)

k_means = KMeans()
visualizer = KElbowVisualizer(k_means, k=(4,30))

visualizer.fit(X_reduced)        # Fit the data to the visualizer
visualizer.show()
# visualizer.savefig("elbow.png")
# X_line = [K[0], K[-1]]
# Y_line = [distortions[0], distortions[-1]]

# # Plot the elbow
# plt.plot(K, distortions, 'r')
# plt.plot(X_line, Y_line, 'b')
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x28fcd6c70>
In [30]:
#So it looks like a good k is 17! 
#Let's run kmeans with 17
k = 17
kmeans = KMeans(n_clusters=k,random_state=0)
y_predictions = kmeans.fit_predict(X_reduced)
data['y_predictions'] = y_predictions
print('Done')
Done
In [56]:
tsne = TSNE(verbose=1, perplexity=50)
X_embedded = tsne.fit_transform(X.toarray())
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 43671 samples in 18.620s...
[t-SNE] Computed neighbors for 43671 samples in 3023.536s...
[t-SNE] Computed conditional probabilities for sample 1000 / 43671
[t-SNE] Computed conditional probabilities for sample 2000 / 43671
[t-SNE] Computed conditional probabilities for sample 3000 / 43671
[t-SNE] Computed conditional probabilities for sample 4000 / 43671
[t-SNE] Computed conditional probabilities for sample 5000 / 43671
[t-SNE] Computed conditional probabilities for sample 6000 / 43671
[t-SNE] Computed conditional probabilities for sample 7000 / 43671
[t-SNE] Computed conditional probabilities for sample 8000 / 43671
[t-SNE] Computed conditional probabilities for sample 9000 / 43671
[t-SNE] Computed conditional probabilities for sample 10000 / 43671
[t-SNE] Computed conditional probabilities for sample 11000 / 43671
[t-SNE] Computed conditional probabilities for sample 12000 / 43671
[t-SNE] Computed conditional probabilities for sample 13000 / 43671
[t-SNE] Computed conditional probabilities for sample 14000 / 43671
[t-SNE] Computed conditional probabilities for sample 15000 / 43671
[t-SNE] Computed conditional probabilities for sample 16000 / 43671
[t-SNE] Computed conditional probabilities for sample 17000 / 43671
[t-SNE] Computed conditional probabilities for sample 18000 / 43671
[t-SNE] Computed conditional probabilities for sample 19000 / 43671
[t-SNE] Computed conditional probabilities for sample 20000 / 43671
[t-SNE] Computed conditional probabilities for sample 21000 / 43671
[t-SNE] Computed conditional probabilities for sample 22000 / 43671
[t-SNE] Computed conditional probabilities for sample 23000 / 43671
[t-SNE] Computed conditional probabilities for sample 24000 / 43671
[t-SNE] Computed conditional probabilities for sample 25000 / 43671
[t-SNE] Computed conditional probabilities for sample 26000 / 43671
[t-SNE] Computed conditional probabilities for sample 27000 / 43671
[t-SNE] Computed conditional probabilities for sample 28000 / 43671
[t-SNE] Computed conditional probabilities for sample 29000 / 43671
[t-SNE] Computed conditional probabilities for sample 30000 / 43671
[t-SNE] Computed conditional probabilities for sample 31000 / 43671
[t-SNE] Computed conditional probabilities for sample 32000 / 43671
[t-SNE] Computed conditional probabilities for sample 33000 / 43671
[t-SNE] Computed conditional probabilities for sample 34000 / 43671
[t-SNE] Computed conditional probabilities for sample 35000 / 43671
[t-SNE] Computed conditional probabilities for sample 36000 / 43671
[t-SNE] Computed conditional probabilities for sample 37000 / 43671
[t-SNE] Computed conditional probabilities for sample 38000 / 43671
[t-SNE] Computed conditional probabilities for sample 39000 / 43671
[t-SNE] Computed conditional probabilities for sample 40000 / 43671
[t-SNE] Computed conditional probabilities for sample 41000 / 43671
[t-SNE] Computed conditional probabilities for sample 42000 / 43671
[t-SNE] Computed conditional probabilities for sample 43000 / 43671
[t-SNE] Computed conditional probabilities for sample 43671 / 43671
[t-SNE] Mean sigma: 0.295053
[t-SNE] KL divergence after 250 iterations with early exaggeration: 111.766785
[t-SNE] KL divergence after 1000 iterations: 2.680552
In [57]:
sns.set(rc={'figure.figsize':(15,15)})
palette = sns.hls_palette(k, l=.5, s=.9)

sns.scatterplot(X_embedded[:,0], X_embedded[:,1], hue=y_predictions, legend='full', palette=palette)
plt.title('T-SNE with Kmeans Labels')
plt.savefig("improved_cluster_tsne.png")
plt.show()
In [31]:
vectorizers = []
    
for i in range(k):
    # Creating a vectorizer
    vectorizers.append(CountVectorizer(min_df=5, max_df=0.9,
                                       stop_words='english',
                                       lowercase=True,
                                       token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}'))
In [32]:
vectorized_data = []

for cluster,vectorizer in enumerate(vectorizers):
    vectorized_data.append(vectorizer.fit_transform(data.loc[data['y_predictions'] == cluster, 'body_text_new']))
In [33]:
num_topics = 5

lda_models = []
for num in range(k):
    lda = LatentDirichletAllocation(n_components=num_topics,
                                    learning_method='online',
                                    verbose=False)
    lda_models.append(lda)
In [34]:
clusters_lda_data = []

for cluster, lda in enumerate(lda_models):    
    clusters_lda_data.append((lda.fit(vectorized_data[cluster])))
In [35]:
def print_topics(model, count_vectorizer, n_top_words=10):
    top_topics = []
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        topic = " ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        top_topics.append(topic)
        print(topic)
        
    return top_topics
In [36]:
def selected_topics(model, vectorizer, top_n=4):
    current_words = []
    keywords = []
    
    for idx, topic in enumerate(model.components_):
        words = [(vectorizer.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]]
        for word in words:
            if word[0] not in current_words:
                keywords.append(word)
                current_words.append(word[0])
                
    keywords.sort(key = lambda x: x[1])  
    keywords.reverse()
    return_values = []
    for ii in keywords:
        return_values.append(ii[0])
    return return_values
In [37]:
all_keywords = []
for current_vectorizer, lda in enumerate(lda_models):
    all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))
In [38]:
#Let's write these words to a file
with open('topics.txt','w') as fout:
    for topic_num, words in enumerate(all_keywords):
        fout.write('Topic ' + str(topic_num) + ':\n' + ', '.join(words) + '\n\n')
        print(('Topic ' + str(topic_num) + ':\n' + ', '.join(words) + '\n'))
Topic 0:
cells, sequence, cell, genome, protein, sequences, infection, bats, viral, population, host, gene, specie, covs, cov, genes, alecto

Topic 1:
covid-, surgery, group, treatment, mask, case, imaging, lung, hospital, infection, study, risk, surgical, care, ppe, respiratory, influenza

Topic 2:
individual, epidemic, population, rate, parameter, testing, social, contact, node, network, spatial, algorithm, graph, model, air, study

Topic 3:
rna, hcv, compound, viral, virus, sequence, membrane, activity, structure, codon, binding, gene, cell, cells, peptide

Topic 4:
cells, ang, expression, patient, binding, covid-, sars-cov-, cell, rbd, sars-cov, gene, lung, activity, viral, mouse, infection

Topic 5:
infection, gene, nsp, sequence, assay, viral, infected, expression, proteins, membrane, rna, ifitm, mouse

Topic 6:
cat, infection, dog, fip, virus, gene, feline, strain, case, clinical, concentration, animal, human, treatment, population

Topic 7:
protein, rna, gene, virus, activity, expression, vaccine, production, acid, specie, extract, compound, pathogen, group, human, host, authority, author

Topic 8:
virus, cells, viral, gene, mouse, sequence, protein, strain, study, vaccine, genome, case, patient, concentration, infection, surface, cell

Topic 9:
patient, protein, camel, human, mouse, sars-cov, vaccine, case, cells, binding, transmission, mers, antibody, outbreak, viral, rna, lung

Topic 10:
health, patient, disease, care, case, public, outbreak, research, data, social, information, user, study, market, model, result

Topic 11:
treatment, milk, infection, colostrum, concentration, strain, sample, herd, antibody, diarrhea, cells, bcov, serum, bovine, gene, cow, cattle, production, beef

Topic 12:
pedv, pig, cells, piglet, strain, sequence, farm, group, animal, gene, infection, cell, intestinal, rna, disease, protein, patient, mucosal, immune

Topic 13:
influenza, asthma, viral, patient, detection, pneumonia, pcr, child, infant, year, bacterial, antibiotic, airway, age, cells, bronchiolitis

Topic 14:
patient, case, covid-, day, model, rate, population, transmission, infected, sars-cov-, health, clinical, risk, sample, viral, study

Topic 15:
cells, lung, infection, covid-, il-, group, donor, cell, blood, treatment, mortality, day, case, care, year, method

Topic 16:
epitope, il-, mouse, peptide, antibody, infection, virus, binding, protein, patient, vaccine, viral, cns, expression, mice

In [39]:
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.distplot(data['y_predictions'],150)
max_x = k
ax.set_xlim(0,max_x)
ax.set_xticks(range(0,max_x,1))
plt.title("Cluster Distribution")
ax.set_ylabel("Frequency")
ax.set_xlabel('Cluster Number')
plt.savefig("cluster_dist.png")
In [40]:
#Let's save all the files that took forever to run
data.to_csv('data1.csv',index=False)

pickle.dump(y_predictions, open("y_predictions.p", "wb" ))

Model Selection and Experimentation

In [41]:
data = pd.read_csv('data1.csv')
data.head()
Out[41]:
paper_id title authors abstract body_text body_text_new abstract_new digit_count body_word_count body_unique_words language alcohol heart gender weight lung age risk y_predictions
0 4fcb95cc0c4ea6d1fa4137a4a087715ed6b68cea end-tidal carbon dioxide levels during resusci... Kentaro Tamura, Emma E Williams, Theodore Dass... Abstract\n\nAbnormal levels of end-tidal carbo... Introduction\n\nImprovements in neonatal inten... improvement, neonatal, intensive care, decreas... abstract, abnormal, level, end-tidal carbon di... 105 2615 846 en NaN NaN NaN 1.0 NaN NaN NaN 13
1 86d4262de73cf81b5ea6aafb91630853248bff5f urban planning of the endoplasmic reticulum (e... Emily M Lynes, Thomas Simmen Abstract\n\nThe endoplasmic reticulum (ER) is ... Introduction\n\nThe endoplasmic reticulum (ER)... endoplasmic reticulum, multi-functional, organ... abstract, endoplasmic reticulum, organelle, ce... 24 8089 2295 en NaN NaN NaN NaN NaN NaN NaN 8
2 86a998617c077f4fe2ab26214995a3548fbc0fc5 middle east respiratory syndrome and severe ac... Rahul Vijay, Stanley Perlman Abstract\n\nThe recent emergence of the Middle... Introduction\n\nWhile most CoVs cause the comm... covs, cold, human, infection, covs, sars-cov, ... abstract, recent, emergence, middle east respi... 21 2485 1053 en NaN NaN NaN NaN NaN NaN NaN 9
3 948aaeb2e0be11ad90562bf10d462531a1f00eac integrated, multi-cohort analysis identifies c... Marta Andres-Terre, Helen M Mcguire, Yannick P... Abstract\n\nGraphical Abstract Highlights d MV... In Brief\n\nClinically relevant respiratory vi... brief, clinically, respiratory viral signature... abstract, graphical abstract, transcriptional,... 142 7115 1826 en NaN NaN 1.0 NaN NaN NaN NaN 8
4 306ef95a3a91e13a93bcc37fb2c509b67c0b5640 a novel approach for a novel pathogen: using a... Chloe Bryson-Cahn, Jeffrey Duchin, Vanessa A M... Abstract\n\nThousands of people in the United ... Introduction\n\nThe 2019 novel coronavirus (SA... novel, coronavirus, sars-cov-2, outbreak, resp... abstract, people, testing, sars-cov-2, evaluat... 20 937 486 en 1.0 NaN NaN NaN NaN NaN NaN 10
In [42]:
#Let's manutally look at the topics and assign them overarching topics
y_topics = ['coronavirus genome',
            'clinical treatment',
            'transmission simulation',
            'virus rna',
           'cell binding',
            'virus sampling',
           'animals and viruses',
            'vaccine compounds',
           'infections',
            'virus detection',
           'risk factors',
            'similar diseases',
           'animal transmission',
            'children and athsma'
            'death rates',
           'lung issues',
           'testing']

#For risk factors, the revelant clusters are 3, but with greater focus on 13
In [43]:
#So the articles that talk about risk are y_predictions == 13
#Let's first look at all the articles with specific words associated with risk included 
#in the title to check which clusters we should do further analysis on
risk_words = ['risk','estimat','characteristic','factors','features','study','predict','clinic']


def contains(df,word):
    return df[df['title'].str.contains(word)]

risk_data = pd.DataFrame()
for word in risk_words:
    out = contains(data,word)
    p = [risk_data,out]
    risk_data = pd.concat(p,join='outer')

# new_data.head(5)

# len(risk_cluster)
# risk_cluster = data[data['y_predictions']==10]
In [44]:
len(risk_data)
Out[44]:
7507
In [45]:
sns.set(rc={'figure.figsize':(8,6)})
ax = sns.distplot(risk_data['y_predictions'],150)
max_x = k
ax.set_xlim(0,max_x)
ax.set_xticks(range(0,max_x,1))
plt.title("Clusters Containing Risk Information")
ax.set_ylabel("Frequency")
ax.set_xlabel('Cluster Number')
plt.savefig("risk_cluster_dist.png")
In [65]:
def images(df,col,match):
    print(col)
    tt = df[df[col]==match]
    v = TfidfVectorizer(max_df=0.1, min_df = 0.05,max_features = 10000,ngram_range = (1,2))
    x = v.fit_transform(tt['body_text_new'])
    freqs = dict()
    for word, idx in v.vocabulary_.items():
        freqs[word] = x.getcol(idx).sum()
    w = WordCloud(width=800,height=600,mode='RGBA',background_color='white',max_words=2000).fit_words(freqs)
    plt.figure(figsize=(20,10))
    plt.imshow(w)
    plt.savefig(col + ".png")
In [67]:
for word in f:
    try:
        images(data,word,1)
    except ValueError:
        pass
alcohol
heart
gender
weight
lung
age
Use the kmean clusters as a classifier model using a stochastic gradient descent classifier.
In [48]:
# function to print out classification model report
def report(model_name, test, pred):
    print(model_name, ":\n")
    print("Accuracy Score: ", '{:,.3f}'.format(float(accuracy_score(test, pred)) * 100), "%")
    print("     Precision: ", '{:,.3f}'.format(float(precision_score(test, pred, average='macro')) * 100), "%")
    print("        Recall: ", '{:,.3f}'.format(float(recall_score(test, pred, average='macro')) * 100), "%")
    print("      F1 score: ", '{:,.3f}'.format(float(f1_score(test, pred, average='macro')) * 100), "%")
In [49]:
X_train, X_test, y_train, y_test = train_test_split(X.toarray(),y_predictions, test_size=0.2)

print("X_train size:", len(X_train))
print("X_test size:", len(X_test), "\n")
X_train size: 34936
X_test size: 8735 

In [50]:
from sklearn.dummy import DummyClassifier

smp_clf = DummyClassifier(strategy="most_frequent")
# train SGD
smp_clf.fit(X_train, y_train)

# cross validation predictions
# sgd_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3, n_jobs=4)
smp_clf = cross_val_predict(smp_clf, X_train, y_train, cv=3, n_jobs=4)

# print out the classification report
report("Majority Classifier (Training Set)", y_train, smp_clf)

# SGD instance
from sklearn.naive_bayes import ComplementNB

# sgd_clf = SGDClassifier(max_iter=1000, random_state=0)
cnb_clf = ComplementNB()
# train SGD
cnb_clf.fit(X_train, y_train)

# cross validation predictions
# sgd_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3, n_jobs=4)
cnb_clf = cross_val_predict(cnb_clf, X_train, y_train, cv=3, n_jobs=4)

# print out the classification report
report("Complement Naive Bayes (Training Set)", y_train, cnb_clf)
Majority Classifier (Training Set) :

Accuracy Score:  22.086 %
     Precision:  1.299 %
        Recall:  5.882 %
      F1 score:  2.128 %
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
Complement Naive Bayes (Training Set) :

Accuracy Score:  81.077 %
     Precision:  87.312 %
        Recall:  81.666 %
      F1 score:  82.414 %
In [54]:
# SGD instance
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=0)
# cnb_clf = ComplementNB()
# train SGD
sgd_clf.fit(X_train, y_train)

# cross validation predictions
# sgd_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3, n_jobs=4)
sgd_clf_p = cross_val_predict(sgd_clf, X_train, y_train, cv=3, n_jobs=4)

# print out the classification report
report("Stochastic Gradient Descent:", y_train, sgd_clf_p)
Stochastic Gradient Descent: :

Accuracy Score:  92.764 %
     Precision:  93.525 %
        Recall:  94.058 %
      F1 score:  93.759 %
In [55]:
pickle.dump(sgd_clf, open('sgd_clf.pkl','wb'))
In [ ]: